log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")
knitr::opts_chunk$set(warning=FALSE, message=FALSE, fig.width = 10, fig.height=5)

require(ggplot2)
## Loading required package: ggplot2
require(data.table)
## Loading required package: data.table
require(lme4)
## Loading required package: lme4
## Loading required package: Matrix
require(lmerTest)
## Loading required package: lmerTest
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
require(lqmm)
## Loading required package: lqmm
require(car)
## Loading required package: car
## Loading required package: carData
require(cowplot)
## Loading required package: cowplot
require(DT)
## Loading required package: DT
require(pheatmap)
## Loading required package: pheatmap
require(limma)
## Loading required package: limma
require(cluster)
## Loading required package: cluster
meta <- fread(snakemake@input[["meta"]])
meta[,PatientID_Time := paste(PatientID, Time_seq, sep ="_")]
meta <- meta[Time_seq != -1,]
clinic <- fread(snakemake@input[["clinic"]], drop=1)
meta <- merge(meta,clinic, by="PatientID_Time", all.x=TRUE, suffix = c(".meta",""))
#meta_merge <- merge(meta,clinic, all.x=TRUE)

subSys <- fread(snakemake@input[["subSysCount"]])
subSysCount <- as.matrix(subSys[,-1, with =F])
rownames(subSysCount) <- gsub(" ","_",unlist(subSys[,1]))
rxns <- fread(snakemake@input[["rxnCount"]])
rxnCount <- as.matrix(rxns[,-1,with=F ])
rownames(rxnCount) <- gsub(" ","_",unlist(rxns[,1]))
subSystems <- fread(snakemake@input[["subsysTable"]])
GPR <- fread(snakemake@input[["GPR"]])

# remove degraded samples
blcklst <- fread(snakemake@input[["sampleBlacklist"]])
blckSmpls <- blcklst[relevance < 2, Sample]

meta <- meta[!(SeqID %in% blckSmpls),]
subSysCount <- subSysCount[,!(colnames(rxnCount) %in% blckSmpls)]
rxnCount <- rxnCount[,!(colnames(rxnCount) %in% blckSmpls)]
meta <- meta[SeqID %in% colnames(rxnCount),]

lqmmModels <- fread(snakemake@input[["lqmmModels"]])
lmmModels <- fread(snakemake@input[["lmmModels"]])
rxnDistStats <- fread(snakemake@input[["rxn_stats"]])

fva <- readRDS(snakemake@input[["fva"]])
distTar <- snakemake@input[["distFiles"]]
# than we can still use a simple wilcox test for each reaction (present/absence) to test association with HB_Mayo

trxnCount <- data.table(t(rxnCount), keep.rownames=TRUE)
rxn_wilcox <- merge(meta[Tissue == "Biopsy" & !is.na(HB_Mayo_impu),], 
                    trxnCount, 
                    by.x ="SeqID",
                    by.y="rn")

rxn <- rownames(rxnCount)[apply(rxnCount,1,sum)>0]
#wcxTest <- data.frame(
#                    log2FC = rep(NA, length(rxn)),
#                    pval = rep(NA,length(rxn)),
#                    padj = rep(NA,length(rxn)),
#                    row.names = rxn)
#for(r in rxn){
#    sel <- rxn_wilcox[[r]]
#    if(sum(sel) < length(sel)-3 & sum(sel) > 3){
#        wcx <- wilcox.test(rxn_wilcox[sel, HB_Mayo_impu],
#                           rxn_wilcox[!sel,HB_Mayo_impu])
#        wcxTest[r,"pval"] <- wcx[["p.value"]]
#        wcxTest[r,"log2FC"] <- mean(log2(rxn_wilcox[sel, HB_Mayo_impu]+1)) -
#                           mean(log2(rxn_wilcox[!sel,HB_Mayo_impu]+1))
#                    }
#}
#wcxTest[["padj"]] <- p.adjust(wcxTest[["pval"]], method = "BH")
#wcxTest <- data.table(wcxTest,keep.rownames=TRUE)
#wcxTest[,absLog2FC := abs(log2FC)]
#wcxTest[,rxn_plot := factor(rn, level = rn[order(log2FC)])]

models <- list()
i <- 0 

require(doMC)
require(parallel)
require(foreach)

registerDoMC(cores = detectCores()-1)

#for(r in rxn){
#    i<-i+1
#    sel <- rxn_wilcox[[r]]
#    if(sum(sel) < length(sel)-3 & sum(sel) > 3){
#        mod <- lmer(data=rxn_wilcox, HB_Mayo_impu ~ sel + (1|PatientID))
#        sumMod <- summary(mod)[["coefficients"]]
#        models[[r]] <- cbind(rxn = r, coefficient =rownames(sumMod), as.data.frame(sumMod))
#    }
#    current <- round(i/length(rxn), 2)
#    cat(paste0(current,"  \r"))
#}

models <- foreach(r = rxn) %dopar% {
    i<-i+1
    sel <- rxn_wilcox[[r]]
    if(sum(sel) < length(sel)-3 & sum(sel) > 3){
        mod <- lmer(data=rxn_wilcox, HB_Mayo_impu ~ sel + (1|PatientID))
        sumMod <- summary(mod)[["coefficients"]]
        cbind(rxn = r, coefficient =rownames(sumMod), as.data.frame(sumMod))
    }
}
names(models) <- rxn
sel <- which(!sapply(models,is.null))
models <- models[sel]

mm <- data.table(do.call(rbind, models))
mm[, padj := p.adjust(`Pr(>|t|)`, method = "BH"), by = coefficient]

mm[coefficient != "(Intercept)" & padj < 0.05, ]
##                 rxn coefficient  Estimate Std. Error       df   t value
##   1:         5HLTDL     selTRUE -1.670913  0.4636078 266.7558 -3.604152
##   2:        ACCOACm     selTRUE -1.215800  0.4472121 246.9083 -2.718621
##   3:       ACOAD10m     selTRUE -1.103716  0.3356784 237.8064 -3.288017
##   4:        ACOAD8m     selTRUE -1.084178  0.3379266 238.3950 -3.208325
##   5:          ALOX5     selTRUE  1.468230  0.4053379 255.1258  3.622237
##  ---                                                                   
## 432:  EX_HC00250(u)     selTRUE  1.385959  0.3388535 236.7123  4.090141
## 433: EX_prostgi2(u)     selTRUE  1.302116  0.4847191 238.6614  2.686331
## 434:   EX_C02470(u)     selTRUE -2.016764  0.3696785 257.7699 -5.455454
## 435:   EX_3ivcrn(u)     selTRUE  1.080751  0.3169071 236.1712  3.410307
## 436:   EX_c51crn(u)     selTRUE -1.246189  0.3390505 231.9877 -3.675525
##          Pr(>|t|)         padj
##   1: 3.735852e-04 5.982231e-03
##   2: 7.020400e-03 4.577367e-02
##   3: 1.161637e-03 1.291853e-02
##   4: 1.518293e-03 1.568334e-02
##   5: 3.523338e-04 5.749921e-03
##  ---                          
## 432: 5.911977e-05 1.244674e-03
## 433: 7.731739e-03 4.913395e-02
## 434: 1.145226e-07 3.096602e-06
## 435: 7.630651e-04 9.071266e-03
## 436: 2.948818e-04 4.972914e-03
# creating a matrix for a venn diagram
vennMat <- matrix(FALSE,nrow=nrow(subSystems), ncol = 4, dimnames = list(rxns = subSystems[["rxns"]], test = c("presence","range_LQMM","range_LMM","distance")))
#vennMat[wcxTest[padj <= 0.05, rn], "presence"] <- TRUE
vennMat[mm[padj <= 0.05 & coefficient != "(Intercept)", rxn], "presence"] <- TRUE
vennMat[lqmmModels[padj <= 0.05, variable], "range_LQMM"] <- TRUE
vennMat[rxnDistStats[padj <= 0.05 & Variable == "HB_Mayo_impu", Rxn], "distance"] <- TRUE
vennMat[lmmModels[padj <= 0.05, variable], "range_LMM"] <- TRUE

# plotting the venn diagram
limma::vennDiagram(vennMat)

# looking into the middle of it
vennSig <- vennMat[apply(vennMat,1,sum) == ncol(vennMat),]

vennSig <- data.frame(vennSig)
setkey(subSystems, "rxns")
vennSig[["Subsystems"]] <- subSystems[rownames(vennSig),subSys]


# get the coefficients for the lmms
rxns <- rownames(vennSig)[vennSig[,"range_LMM"]]
fva_range <- fva[rxns,,"range"]
fva_range <- apply(fva_range, 2,scale)
rownames(fva_range) <- rxns
fva_range <- data.table(data.frame(t(fva_range)), keep.rownames=TRUE)
fva_range <- merge(fva_range, meta[Tissue =="Biopsy",], by.x = "rn", by.y = "SeqID")
setkey(lmmModels, "variable")
lmmModels[,coefficient := 0]
coefs <- foreach(reac = rxns) %dopar%
#for(reac in rxns)
{
    if(strsplit(reac,split = "")[[1]][1] %in% as.character(0:9)){
       reac_x <- gsub("\\((.*)\\)",".\\1.",paste0("X",reac))
    } else {
        reac_x <-gsub("\\((.*)\\)",".\\1.",reac) 
    }
    fom <- as.formula(paste(reac_x, "~ HB_Mayo_impu + (1|PatientID)"))
    mod <- lmer(data = fva_range, formula = fom)
    fixef(mod)[2]
}
lmmModels[rxns, coefficient := unlist(coefs)]

# add the metrics
setkey(mm,"rxn")
vennSig[["Presence_Coef"]] <- mm[coefficient != "(Intercept)",][rownames(vennSig), Estimate]
setkey(lqmmModels,"variable")
vennSig[["Range_coeff_LQMM"]] <- as.numeric(lqmmModels[rownames(vennSig),coefficient])
setkey(rxnDistStats,"Rxn")
vennSig[["Dist_R2"]] <- rxnDistStats[rownames(vennSig),R2]
setkey(lmmModels, "variable")
vennSig[["Range_coeff_LMM"]]<-as.numeric(lmmModels[rownames(vennSig),coefficient])

# plot the wilcoxon fold change
pp_log2FC <-ggplot(data = vennSig, aes(y = Subsystems, x = Presence_Coef, color = Subsystems)) +
    geom_vline(xintercept = 0,linetype=2)+
    geom_jitter(height=0.25, width = 0,size=2) +
    theme_bw() +
    theme(legend.position = "none")
# plot the lqmm coefficient
pp_coeff<-ggplot(data = vennSig, aes(y = Subsystems, x = as.numeric(Range_coeff_LQMM),color = Subsystems)) +
    geom_vline(xintercept = 0,linetype=2)+
    geom_jitter(height=0.25, width = 0,size=2)+
    xlab("Coefficient of LQMM model")+
    theme_bw() +
    theme(legend.position = "none",
    axis.text.y = element_blank(),
    axis.title.y = element_blank())
# plot the lmm coefficient
pp_coeff_LMM<-ggplot(data = vennSig, aes(y = Subsystems, x = as.numeric(Range_coeff_LMM),color = Subsystems)) +
    geom_vline(xintercept = 0,linetype=2)+
    geom_jitter(height=0.25, width = 0,size=2)+
    xlab("Coefficient of LMM model")+
    theme_bw() +
    theme(legend.position = "none",
    axis.text.y = element_blank(),
    axis.title.y = element_blank())
# plot the variance in distance explained by the variable tested
pp_R2 <-ggplot(data = vennSig, aes(y = Subsystems, x = Dist_R2,color =Subsystems)) +
    geom_jitter(height=0.25, width = 0,size=2)+
    xlab("Distance variance explained by HB/Mayo")+
    theme_bw() +
    theme(legend.position = "none",
    axis.text.y = element_blank(),
    axis.title.y = element_blank())

pp <- cowplot::plot_grid(pp_log2FC,pp_coeff,pp_coeff_LMM,pp_R2,ncol=4,rel_widths = c(2,1,1,1))
pp

pp_log2FCbar <-ggplot(data = vennSig[order(vennSig[["Presence_Coef"]],vennSig[["Subsystems"]]),], aes(x=1:nrow(vennSig),y = Presence_Coef, fill = Subsystems)) +
    geom_bar(stat="identity")+
    theme_bw() +
    xlab("Rxns")+
    ylab("Fold change of HB/Mayo score on presence of rxn")+
    theme(axis.text.x = element_blank())

vennSig2 <- merge(data.table(vennSig, keep.rownames =TRUE), GPR, by.x="rn",by.y="rxn",keep.x=TRUE)

# function to count the genes in a string
countGPRgenes <- function(pattern, text){
    # @text - a vector of character strings to look for the pattern
    # @pattern - a pattern to look for in the text
    # returns - a list of the same length as text with the number of unique patterns
    result <- list()
    cnt <- 0
    for(t in text){
        cnt <- cnt+1
        srch <- gregexpr(pattern, t)[[1]]
        res <- c()
        for(i in 1:length(srch)){
            srch_len <- attributes(srch)[["match.length"]][i]
            res<- c(res,substr(t,srch[i],srch[i]+srch_len-1))
        }
    result[[cnt]] <- unique(res)
    }
    return(result)
}




vennPlot <- vennSig2[,.(numberRxn = .N,
                        mean_log2FC = mean(Presence_Coef),
                        mean_coeff_lqmm= mean(Range_coeff_LQMM),
                        mean_coeff_lmm= mean(Range_coeff_LMM),
                        mean_R2 = mean(as.numeric(Dist_R2))), 
                    by = Subsystems]

# extract the number of genes involved
ngenes <- sapply(countGPRgenes("HGNC:\\d+",unlist(vennSig2[,.(n_genes = gsub("\\s","",paste(GPR,collapse=","))),by="Subsystems"][,2])), length)
vennPlot[,n_genes := ngenes]


pp <- ggplot(data = vennPlot, aes(y=mean_coeff_lmm, x = mean_log2FC, size = mean_R2, color = Subsystems)) +
    geom_point()
pp

pp_genesbar <- ggplot(data = vennPlot, aes(x=Subsystems, y=n_genes, fill = Subsystems)) +
    geom_bar(stat="identity") +
    ylab("Number of genes involved")+
    theme_bw() +
    theme(legend.position = "none",
          axis.text.x = element_blank())

pp_inset <- ggdraw() +
    draw_plot(pp_log2FCbar) +
    draw_plot(pp_genesbar, x=0.40,y=0.07, width=0.3,height=0.3)
pp_inset

This is the list of significant genes from all analysis

datatable(vennSig2)

Plotting of the association

Having a list of genes, we can now look into the relation of the ranges to the HB/Mayo score.

sig_rxn <- vennSig2[,rn]
fva_min <- reshape2::melt(fva[sig_rxn,,"min"])
fva_max <- reshape2::melt(fva[sig_rxn,,"max"])
fva_range <-reshape2::melt(fva[sig_rxn,,"range"])
fva_center <-reshape2::melt(fva[sig_rxn,,"center"])

fva_dat <- merge(fva_min,fva_max,suffix = c(".min",".max"),by=c("rxn","sample"))
fva_dat <- merge(fva_dat,fva_center,by=c("rxn","sample"))
fva_dat <- merge(fva_dat,fva_range,suffix = c(".center",".range"),by=c("rxn","sample"))


fva_dat <- reshape2::melt(fva_dat, id.vals = paste0("value.",dimnames(fva)[3]))
fva_dat[["variable"]] <- gsub("^value\\.","",fva_dat[["variable"]])

fva_dat <- merge(fva_dat, meta, by.x="sample",by.y="SeqID")
fva_dat <- merge(fva_dat, vennSig2, by.x="rxn",by.y="rn")
fva_dat <- data.table(fva_dat)


# select only those reaction which are not based on the same GPR
rxnSelection <- vennSig2[!duplicated(GPR),rn]
rdr <- vennSig2[,rn][order(vennSig2[,Subsystems])]
rdr <- split(rdr, ceiling(seq_along(rdr)/20))

# dotplots
for(rxnSelection in rdr){
fva_plot <- fva_dat[rxn %in% rxnSelection,]
pp <- ggplot(fva_plot[variable %in% c("min","max"),], aes(x=HB_Mayo_impu, y=value, color=variable)) +
    geom_jitter(height= 0, width=0.25,alpha=0.5) +
    geom_smooth()+
    facet_wrap(~rxn, scale ="free")
print(pp)
}

Clustering

Using only the significant reactions we can try to cluster the data once more.

distFiles <- untar(distTar, list=TRUE)
f <- sapply(paste0("/",vennSig2[,rn],".csv"),
            grep,
            x = distFiles,
            value=TRUE,
            fixed=TRUE)
tmpd <- file.path(tempdir(),"untar")
dir.create(tmpd)
untar(distTar, files = unlist(f),exdir = tmpd)
distMat <- lapply(file.path(tmpd,unlist(f)),read.csv,row.names=1)
distMat <- simplify2array(lapply(distMat,as.matrix))
dimnames(distMat)[[3]] <- vennSig2[,rn]
unlink(tmpd, recursive=TRUE)

distAll <- apply(distMat,1:2,mean)
distscl <- cmdscale(as.dist(distAll))

distscl <- merge(meta,data.table(distscl, keep.rownames=TRUE), by.x="SeqID", by.y="rn")

pp <- ggplot(distscl, aes(x=V1,y=V2, color = HB_Mayo_impu<5, shape = Response,label = Response)) +
             geom_text() +
             facet_wrap(~Diagnosis)
pp

pp <- ggplot(distscl, aes(x=V1,y=V2, color = as.factor(Time_seq), shape = HB_Mayo_impu<5)) +
             geom_point(size=3,alpha=0.5)
pp

pp <- ggplot(distscl, aes(x=HB_Mayo_impu, y=V1)) +
    geom_point()+
    geom_smooth()
pp

pp <- ggplot(distscl, aes(x=HB_Mayo_impu, y=V2)) +
    geom_point()+
    geom_smooth()
pp

with(distscl,cor.test(HB_Mayo_impu,V1,method="kendall"))
## 
##  Kendall's rank correlation tau
## 
## data:  HB_Mayo_impu and V1
## z = 7.6445, p-value = 2.098e-14
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##       tau 
## 0.2195279
with(distscl,cor.test(HB_Mayo_impu,V2,method="kendall"))
## 
##  Kendall's rank correlation tau
## 
## data:  HB_Mayo_impu and V2
## z = -0.5743, p-value = 0.5658
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##         tau 
## -0.01649235
distclst <- pam(distAll, k = 2, diss = TRUE)
setkey(distscl, "SeqID")
distscl[names(distclst[["clustering"]]), PAMs := distclst[["clustering"]]]

pp <- ggplot(distscl, aes(x=factor(PAMs), y = HB_Mayo_impu)) +
    geom_boxplot()
pp

wilcox.test(data=distscl, HB_Mayo_impu~factor(PAMs))
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  HB_Mayo_impu by factor(PAMs)
## W = 15744, p-value = 1.509e-11
## alternative hypothesis: true location shift is not equal to 0
# do a plot of all reactions singular
for(i in 1:length(rdr)){
    rxns <-rdr[[i]]
    distRy <- distMat[,,rxns]
    dstscl <- lapply(rxns,function(x) cmdscale(as.dist(distRy[,,x])))
    names(dstscl) <- rxns
    for(reac in seq_along(dstscl)){
        dstscl[[reac]] <- cbind(data.frame(dstscl[[reac]]),rxn = names(dstscl)[reac],SeqID = rownames(dstscl[[reac]]))
    }
    dstscl <- data.table(do.call(rbind, dstscl))
    dstscls <- merge(meta, dstscl, by="SeqID")

    pp <- ggplot(dstscls, aes(x=X1,y=X2, color = HB_Mayo_impu<5, shape = Remission)) +
                 geom_point(size=3,alpha=0.5) +
                 facet_wrap(~rxn, scale = "free")
    print(pp)

    pp <- ggplot(dstscls, aes(x=HB_Mayo_impu, y=X1)) +
        geom_point()+
        geom_smooth()+
        facet_wrap(~rxn, scale = "free")
    print(pp)
    pp <- ggplot(dstscls, aes(x=HB_Mayo_impu, y=X2)) +
        geom_point()+
        geom_smooth()+
        facet_wrap(~rxn, scale = "free")
    print(pp)

}